Methylome Atlas of Acute Leukemia#

Load data#

Hide code cell source
import pandas as pd
from source.pacmap_functions import *

input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

# read df_discovery and df_validation
df_discovery = pd.read_pickle(
    input_path+'df_discovery.pkl').sort_index()

df_validation = pd.read_pickle(
    input_path+'df_validation.pkl').sort_index()

# Load clinical data
discovery_clinical_data = pd.read_csv(input_path+'discovery_clinical_data.csv',
                                      low_memory=False, index_col=0)

# Load clinical data
validation_clinical_data = pd.read_csv(input_path+'validation_clinical_data.csv',
                                        low_memory=False, index_col=0)

# Adjust clinical data
discovery_clinical_data['Train Test'] = 'Discovery (train) Samples'
validation_clinical_data['Train Test'] = 'Validation (test) Samples'

discovery_clinical_data['PaCMAP Output'] = 'Patient Samples'
validation_clinical_data['PaCMAP Output'] = 'Patient Samples'

discovery_clinical_data['Batch'] = df_discovery['Batch']
validation_clinical_data['Batch'] = 'St Jude Children\'s'

# use overlapping features between df_discovery and df_validation
common_features = [x for x in df_discovery.columns if x in df_validation.columns]

# apply `common_features` to both df_discovery and df_validation
df_discovery = df_discovery[common_features]
df_validation = df_validation[common_features]

print(
f' Discovery dataset (df_discovery) contains {df_discovery.shape[1]} \
columns (5mC nucleotides/probes) and {df_discovery.shape[0]} rows (samples).')

print(
f' Validation dataset (df_validation) contains {df_validation.shape[1]} \
columns (5mC nucleotides/probes) and {df_validation.shape[0]} rows (samples).')

output_notebook()

# Set the theme for the plot
curdoc().theme = 'light_minimal' # or 'dark_minimal'
 Discovery dataset (df_discovery) contains 319738 columns (5mC nucleotides/probes) and 3330 rows (samples).
 Validation dataset (df_validation) contains 319738 columns (5mC nucleotides/probes) and 201 rows (samples).
Loading BokehJS ...

The Methylome Atlas of Acute Leukemia#

Hide code cell source
clinical_trials = ['NOPHO ALL92-2000', 
                    'AAML0531',
                    'AAML1031',
                    'Beat AML Consortium',
                    'TCGA AML',
                    'CETLAM SMD-09 (MDS-tAML)',
                    'French GRAALL 2003–2005',
                    'TARGET ALL',
                    'AAML03P1',
                    'Japanese AML05',
                    'CCG2961']

sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow',
                'Bone Marrow Normal','Primary Blood Derived Cancer - Peripheral Blood',
                'Blood Derived Normal','Likely Diagnosis', 'Control (Healthy Donor)',
                'Relapse','Recurrent Blood Derived Cancer - Bone Marrow',
                'Recurrent Blood Derived Cancer - Peripheral Blood',
                'Peripheral Blood Normal']

cols = ['Clinical Trial', 'Sample Type', 'Patient_ID', 'ELN AML 2022 Diagnosis', 'Train Test', 'Batch', 'Hematopoietic Lineage']

# components = [2,5]
# for n in components:
#     processor = DataProcessor(discovery_clinical_data.copy(),
#                               df_discovery,
#                               clinical_trials,
#                               sample_types,
#                               cols, 
#                               n_components=n,
#                               common_prefix=output_path+f'pacmap_output/pacmap_{n}d_model_acute_leukemia', 
#                               df_test=df_validation.copy(),
#                               test_clinical_data=validation_clinical_data.copy())
    
#     processor.filter_data()
#     processor.apply_pacmap() # learn PaCMAP on the training data
#     processor.apply_pacmap_test() # apply PaCMAP to the test data
#     processor.join_labels()

#     # Save output
#     processor.df.to_csv(output_path+f'pacmap_output/pacmap_{n}d_output_acute_leukemia.csv')


df = pd.read_csv(output_path+'pacmap_output/pacmap_2d_output_acute_leukemia.csv', index_col=0)
Hide code cell source
# Concatenate discovery and validation clinical data
clinical_data = pd.concat([discovery_clinical_data, validation_clinical_data]).loc[df['index']]

# Select columns to plot
cols = ['PaCMAP Output','Hematopoietic Lineage','WHO 2022 Diagnosis','WHO AML 2022 Diagnosis',
        'WHO ALL 2022 Diagnosis','ELN AML 2022 Diagnosis','Age (group years)', 'Batch', 'Sex',
        'Clinical Trial', 'Sample Type', 'Train Test']

# Join clinical data to the embedding
df = df.join(clinical_data[cols], rsuffix='_copy', on='index')

# Call the BokehPlotter class to plot the data
plotter = BokehPlotter(df, cols, get_custom_color_palette(), 
                       title='The Methylome Atlas of Acute Leukemia',
                       x_range=(-40, 40), y_range=(-50, 50),
                       datapoint_size=3)
plotter.plot()

Compact version#

Hide code cell source
# Call the BokehPlotter class to plot the data
plotter = BokehPlotter(df, cols, get_custom_color_palette(), 
                       title='The Methylome Atlas of Acute Leukemia',
                       x_range=(-40, 40), y_range=(-50, 50),
                       datapoint_size=3, width=900, height=400)
plotter.plot()

Supplemental 1. The Methylome Atlas of Pediatric AML#

Hide code cell source
clinical_trials = ['AAML0531', 'AAML1031', 'AAML03P1', 'CCG2961', 'Japanese AML05']

sample_types = ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Bone Marrow Normal',
                'Primary Blood Derived Cancer - Peripheral Blood', 'Blood Derived Normal']

cols = ['Clinical Trial', 'Sample Type', 'Patient_ID', 'ELN AML 2022 Diagnosis', 'Train Test', 'Batch']

# components = [2,5]
# for n in components:
#     processor = DataProcessor(discovery_clinical_data.copy(),
#                               df_discovery,
#                               clinical_trials,
#                               sample_types,
#                               cols, 
#                               n_components=n,
#                               common_prefix=output_path+f'pacmap_output/pacmap_{n}d_model_peds_dx_aml', 
#                               df_test=df_validation.copy(),
#                               test_clinical_data=validation_clinical_data.copy())
    
#     processor.filter_data()
#     processor.apply_pacmap() # learn PaCMAP on the training data
#     processor.apply_pacmap_test() # apply PaCMAP to the test data
#     processor.join_labels() # join clinical data to the embedding

#     # Save output
#     processor.df.to_csv(output_path+f'pacmap_output/pacmap_{n}d_model_peds_dx_aml.csv')
Hide code cell source
df2 = pd.read_csv(output_path+'pacmap_output/pacmap_2d_model_peds_dx_aml.csv', index_col=0)

# Select columns to plot
cols = ['PaCMAP Output','Hematopoietic Lineage','WHO AML 2022 Diagnosis','ELN AML 2022 Diagnosis', 'FAB', 'FLT3 ITD', 'Age (group years)',
        'Complex Karyotype', 'Primary Cytogenetic Code' ,'Batch', 'Sex', 'MRD 1 Status',
        'Leucocyte counts (10⁹/L)', 'Risk Group', 'Race or ethnic group',
        'Clinical Trial', 'Vital Status','First Event','Sample Type', 'Train Test']

# Join clinical data to the embedding
df2 = df2.join(clinical_data[cols], rsuffix='_copy', on='index')

plotter = BokehPlotter(df2, cols, get_custom_color_palette(),
                       title='The Methylome Atlas of Pediatric AML',
                        x_range=(-45, 45), y_range=(-45, 45),
                        datapoint_size=4, tooltip_dx_cols='ELN AML 2022 Diagnosis')
plotter.plot()

Watermark#

Author: Francisco_Marchi@Lamba_Lab_UF

Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.12.2

numpy  : 1.24.3
pandas : 2.0.2
bokeh  : 3.1.1
pacmap : 0.7.0
itables: 1.5.2

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.90.1-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 20
Architecture: 64bit